---
title: "PAPEA Pipeline | Part 2"
author: "Sebastian Haunss, Priska Daphi, Jan Matti Dollbaum, Lidiya Hristova, Pál Susánszky, Elias Steinhilper"
date: "2025-02-14"
output: 
  html_document:
    theme: cerulean
    toc: yes
    toc_float:
      collapsed: true
---

```{r setup, include=FALSE}
# Show the source code of every chunk in the rendered document by default.
knitr::opts_chunk$set(echo = TRUE)
```

# Date classification and consolidation of data

This script performs date classification and combines the predictions for form, claim, location, and date in one unified event file.

## 1. Date classification

The date classifier has been developed for German newspaper data only and needs to be localized for other languages. As a more general alternative, ***spaCy*** date detection can be used.
The advantage of the customized script is that it runs much faster than spaCy and can easily be adapted to project needs.

```{r libraries}
library(stringr)
library(tidyverse)
library(kableExtra)
```

### Define functions

```{r functions}
# functions
# Sort all elements of `row` except the first one, placing NAs at the end.
# The first element (the article id) stays in position 1, so non-missing
# values are moved to the left-most slots after the id.
shift_left <- function(row) {
  id_part <- row[1]
  sorted_rest <- sort(row[-1], na.last = TRUE)
  c(id_part, sorted_rest)
}

# Return the most frequent non-missing value(s) of `x`.
#
# x: an atomic vector (e.g. numeric, character or Date); NAs are ignored.
# Returns a vector of the same type as `x` holding every value that reaches
# the highest frequency (more than one element in case of ties, in order of
# first appearance). An empty or all-NA input yields a zero-length vector
# without raising a warning.
Mode <- function(x) {
  x <- x[!is.na(x)]
  if (length(x) == 0) {
    # max() on an empty count table would warn and return -Inf.
    return(x)
  }
  ux <- unique(x)
  tab <- tabulate(match(x, ux))
  ux[tab == max(tab)]
}

# Replace every German month name occurring in `x` by its two-digit month
# number (e.g. "März" becomes "03"); all other characters are left untouched.
month_to_date <- function(x) {
  month_numbers <- c(
    Januar = "01", Februar = "02", März = "03", April = "04",
    Mai = "05", Juni = "06", Juli = "07", August = "08",
    September = "09", Oktober = "10", November = "11", Dezember = "12"
  )
  str_replace_all(x, month_numbers)
}
```

### Reading and merging the datasets

```{r input data}
# Read the sentence-level sample and the article-level sample, then attach
# each article's `date` column to its sentences via the article id (`aid`).
# `aid` is converted to character in both tables so the join keys match.
d1 <- read.csv("../data/taz2015_sample_relevant_sentences.csv")
d1$aid <- as.character(d1$aid)

d2 <- read.csv("../data/taz2015_sample_relevant.csv")
d2$aid <- as.character(d2$aid)
d2 <- d2[c("aid", "date")]

data <- left_join(d1, d2, by = "aid")
```



### Do actual date detection

```{r date detection}
# Flag sentences that mention a weekday or a relative time expression.
# Every flag is stored as 0/1 (NA when the sentence itself is NA).
data <- data %>%
  mutate(
    # "Wochenende" only counts when no explicit weekend day is named.
    wochenende = as.numeric(
      str_detect(sentence, "Wochenende") &
        !str_detect(sentence, "Samstag|Sonntag|Sonnabend")
    ),
    montag = as.numeric(str_detect(sentence, "Montag")),
    dienstag = as.numeric(str_detect(sentence, "Dienstag")),
    mittwoch = as.numeric(str_detect(sentence, "Mittwoch")),
    donnerstag = as.numeric(str_detect(sentence, "Donnerstag")),
    freitag = as.numeric(str_detect(sentence, "Freitag")),
    samstag = as.numeric(str_detect(sentence, "Samstag|Sonnabend")),
    sonntag = as.numeric(str_detect(sentence, "Sonntag")),
    # Number of different weekdays mentioned in the sentence.
    tag_multi = montag + dienstag + mittwoch + donnerstag +
      freitag + samstag + sonntag,
    # Weekday number (1 = Montag ... 7 = Sonntag) when exactly one weekday
    # is mentioned; NA otherwise. With a single flag set, the weighted sum
    # equals the position of that weekday.
    tag_only_one = if_else(
      tag_multi == 1,
      1 * montag + 2 * dienstag + 3 * mittwoch + 4 * donnerstag +
        5 * freitag + 6 * samstag + 7 * sonntag,
      NA_real_
    ),
    heute = as.numeric(str_detect(sentence, "Heute|heute|heutige")),
    # "morgen"/"gestern" are not counted when they are part of
    # "übermorgen"/"vorgestern", which have their own flags.
    morgen = as.numeric(
      str_detect(sentence, "Morgen|morgen|morgige") &
        !str_detect(sentence, "Übermorgen|übermorgen|übermorgige")
    ),
    ubermorgen = as.numeric(
      str_detect(sentence, "Übermorgen|übermorgen|übermorgige")
    ),
    gestern = as.numeric(
      str_detect(sentence, "Gestern|gestern") &
        !str_detect(sentence, "Vorgestern|vorgestern")
    ),
    vorgestern = as.numeric(str_detect(sentence, "Vorgestern|vorgestern"))
  )

# Date of publication (ISO "2015-02-14" or German "14.02.2015" notation).
data$PD <- as.Date(data$date, tryFormats = c("%Y-%m-%d", "%d.%m.%Y"))

# Weekday of publication. "%u" yields the ISO weekday number
# (1 = Monday ... 7 = Sunday) independently of the session locale.
# weekdays() is deliberately avoided: its output is localized, so mapping
# its (alphabetically sorted) names to numbers only works in one specific
# locale and only if all seven weekdays occur in the data.
data$PDday <- factor(
  format(data$PD, "%u"),
  levels = as.character(1:7),
  labels = c("Montag", "Dienstag", "Mittwoch", "Donnerstag",
             "Freitag", "Samstag", "Sonntag")
)
# The factor levels are ordered Monday..Sunday, so the integer codes are the
# weekday numbers that are compared with `tag_only_one` later on.
data$PDday_num <- as.numeric(data$PDday)

data$PDmonth <- month(data$PD)

# Date detection from weekday names and relative time expressions.
# The publication-date string is no longer needed; `date` now becomes the
# detected event date.
data$date <- NULL

## Exactly one weekday mentioned: the event is placed on the most recent such
## weekday on or before the publication date (0 to 6 days back).
data <- data %>%
  mutate(date = PD - (PDday_num - tag_only_one) %% 7)

## "Wochenende" without an explicit day: use the most recent Saturday
## (weekday 6) on or before the publication date. The same anchor is used for
## every publication weekday, consistent with the weekday rule above.
data <- data %>%
  mutate(date = if_else(
    is.na(date) & wochenende %in% 1,
    PD - (PDday_num - 6) %% 7,
    date
  ))

## Relative expressions, only for sentences that are still undated.
## If several expressions occur, the first matching line below wins.
data <- data %>%
  mutate(date = coalesce(
    date,
    PD + case_when(
      vorgestern == 1 ~ -2,
      ubermorgen == 1 ~ 2,
      morgen == 1 ~ 1,
      gestern == 1 ~ -1,
      heute == 1 ~ 0
    )
  ))

# Date detection from explicit calendar dates in the sentence text.
# Four patterns are tried from most to least specific; each one only fills
# sentences that are still undated after the previous steps. After every
# step `datum_det` holds the dates found by that step.
month_alternatives <- "(Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)"

# Extract `pattern` from the sentences that have no date yet.
# Returns a character vector as long as `sentence`; rows that already have a
# date (or do not match) are NA. Pre-allocating the full-length result avoids
# sub-assigning into a deleted column, which would produce a vector that is
# shorter than the data frame.
extract_if_undated <- function(sentence, date, pattern) {
  detected <- rep(NA_character_, length(sentence))
  undated <- is.na(date)
  detected[undated] <- str_extract(sentence[undated], pattern)
  detected
}

# Convert "day month year" strings with a German month name
# (e.g. "3 März 2015") to Date; NA and impossible dates give NA.
german_to_date <- function(x) {
  x <- str_replace_all(month_to_date(x), " ", "-")
  as.Date(x, format = "%d-%m-%Y")
}

## Full date: day, month and year, e.g. "3. März 2015".
data$datum_det <- data$sentence %>%
  str_extract(paste0("([0-9]{1,2})\\. ", month_alternatives, " ([2][0][0-9][0-9])")) %>%
  str_remove("\\.") %>%
  german_to_date()
data$date <- coalesce(data$date, data$datum_det)

## Day and month without a year, e.g. "3. März".
## The year of publication is assumed.
day_month <- extract_if_undated(
  data$sentence, data$date,
  paste0("([0-9]{1,2})\\. ", month_alternatives)
) %>%
  str_remove("\\.")
# str_c() propagates NA, so rows without a match stay NA.
data$datum_det <- german_to_date(
  str_c(day_month, " ", as.character(year(data$PD)))
)
data$date <- coalesce(data$date, data$datum_det)

## Month and year without a day, e.g. "im März 2015".
## The 15th is used as the day of the month.
month_year <- extract_if_undated(
  data$sentence, data$date,
  paste0("(im|Im) ", month_alternatives, " ([2][0][0-9][0-9])")
) %>%
  str_remove("im |Im ")
data$datum_det <- german_to_date(str_c("15 ", month_year))
data$date <- coalesce(data$date, data$datum_det)

## Month only, e.g. "im März".
## The 15th of that month in the year of publication is used.
month_only <- extract_if_undated(
  data$sentence, data$date,
  paste0("(im|Im) ", month_alternatives)
) %>%
  str_remove("im |Im ")
data$datum_det <- german_to_date(
  str_c("15 ", month_only, " ", as.character(year(data$PD)))
)
data$date <- coalesce(data$date, data$datum_det)

## Aggregating to the article level.
# One row per article with the sentence-level dates spread over the columns
# date1, date2, ... Within each article the dates are sorted ascending and
# missing values are moved to the end, so detected dates always occupy the
# left-most columns. Sorting is done on the Date values themselves, which
# keeps the columns typed as Date.
df <- data %>%
  select(aid, date) %>%
  group_by(aid) %>%
  mutate(
    date = sort(date, na.last = TRUE),
    row = row_number()
  ) %>%
  ungroup() %>%
  pivot_wider(names_from = row, values_from = date, names_prefix = "date")

# nasum: number of detected (non-missing) dates per article.
date_cols <- setdiff(names(df), "aid")
df$nasum <- as.integer(rowSums(!is.na(df[date_cols])))

# Add the publication date; it is parsed with the same formats that are used
# for the sentence-level `PD` column.
df <- left_join(df, d2 %>% rename(PD = date), by = "aid")
df$PD <- as.Date(df$PD, tryFormats = c("%Y-%m-%d", "%d.%m.%Y"))
```

### Merge found dates into one dataframe

```{r merging}
# Choose one event date per article.
# - If no date was detected in the text: the day before publication (PD - 1).
# - If one date was detected in the text: that date.
# - If multiple dates were detected: a) the most frequently occurring date or,
#   if several dates are equally frequent, b) the one closest to the date of
#   publication.
#
# found_dates: Date vector with all dates of one article (may contain NA).
# pub_date:    publication date of the article (Date of length 1).
# Returns a Date of length 1; NA if a tie cannot be resolved because the
# publication date is missing.
pick_event_date <- function(found_dates, pub_date) {
  found_dates <- found_dates[!is.na(found_dates)]
  if (length(found_dates) == 0) {
    return(pub_date - 1)
  }
  candidates <- Mode(found_dates)
  if (length(candidates) == 1) {
    return(candidates)
  }
  closest <- which.min(abs(difftime(candidates, pub_date, units = "days")))
  if (length(closest) == 0) {
    return(as.Date(NA))
  }
  candidates[closest]
}

# The helper is called once per row, so every branch always yields exactly
# one value, also for articles without any detected date.
articles_date <- df %>%
  rowwise() %>%
  mutate(date = pick_event_date(c_across(starts_with("date")), PD)) %>%
  ungroup()

# Drop all intermediate objects; only the article-level result is kept.
rm(list = setdiff(ls(), "articles_date"))

articles_date <- articles_date %>%
  select(aid, date_predicted = date, PD)
```

### Save to file


```{r write}
# Write the article-level result (aid, date_predicted, PD) as a
# comma-separated file; it is read again in the consolidation step below.
write_delim(articles_date, file = "../data/taz2015_sample_date_predicted.csv", delim = ",")
```

## 2. Consolidate predictions

### Read relevant articles
```{r}
# Article-level sample. `aid` is converted to character so that it matches
# the character ids of the prediction tables it is joined with.
articles_relevant <- read_delim("../data/taz2015_sample_relevant.csv")
articles_relevant <- mutate(articles_relevant, aid = as.character(aid))
```

### Claims

```{r claims}
# Claims ----

# read predicted claims
claims <- read_delim("../data/taz2015_sample_claims_predicted.csv") %>%
  mutate(aid = as.character(aid))

# count found claims per article and rank them within each article
# (cid = 1 is the most frequent claim; ties are ordered by claim number)
claims_count <- claims %>%
  count(aid, claim_nr = prediction_claim, name = "claim_count") %>%
  arrange(aid, desc(claim_count)) %>%
  group_by(aid) %>%
  mutate(cid = row_number()) %>%
  ungroup()

# frequently found claims (found more than twice in an article)
hf_claims <- claims_count %>%
  filter(claim_count > 2)

# for articles without any frequent claim keep only the most often found claim
lf_claims <- claims_count %>%
  anti_join(hf_claims, by = "aid") %>%
  filter(cid == 1)

# re-combine the two data frames and aggregate at article level
# (columns claim_nr_<cid> and claim_count_<cid>)
claims_articles <- bind_rows(hf_claims, lf_claims) %>%
  arrange(aid) %>%
  pivot_wider(values_from = c(claim_nr, claim_count), names_from = cid)

# keep only the two most frequently found claims per article.
# ends_with() selects exactly the rank-1 and rank-2 columns; a substring match
# on "1"/"2" would also pick up ranks such as 10, 11, 12 or 21.
claims_articles_main <- claims_articles %>%
  select(aid, ends_with("_1"), ends_with("_2"))

# combine with article dataset
protest_articles_true_claims <- left_join(articles_relevant, claims_articles_main, by = "aid")

```

### Forms

```{r}
# Forms ----

# read predicted forms
forms <- read_delim("../data/taz2015_sample_forms_predicted.csv") %>%
  mutate(aid = as.character(aid))

# count found forms per article and rank them within each article
# (cid = 1 is the most frequent form; ties are ordered by form number)
forms_count <- forms %>%
  count(aid, form_nr = prediction, name = "form_count") %>%
  arrange(aid, desc(form_count)) %>%
  group_by(aid) %>%
  mutate(cid = row_number()) %>%
  ungroup()

# ignore "no form" (form_nr 0) and rank the remaining forms again, so that
# the ranks of the real forms are contiguous and start at 1 wherever
# "no form" was placed in the original ranking
forms_ranked <- forms_count %>%
  filter(form_nr != 0) %>%
  group_by(aid) %>%
  mutate(cid = row_number()) %>%
  ungroup()

# frequently found forms (found more than twice in an article)
hf_forms <- forms_ranked %>%
  filter(form_count > 2)

# for articles without any frequent form keep only the most often found form
lf_forms <- forms_ranked %>%
  anti_join(hf_forms, by = "aid") %>%
  filter(cid == 1)

# re-combine the two data frames and aggregate at article level
# (columns form_nr_<cid> and form_count_<cid>)
forms_articles <- bind_rows(hf_forms, lf_forms) %>%
  arrange(aid) %>%
  pivot_wider(values_from = c(form_nr, form_count), names_from = cid)

# keep only the two most frequently found forms per article.
# ends_with() selects exactly the rank-1 and rank-2 columns; a substring match
# on "1"/"2" would also pick up ranks such as 10, 11, 12 or 21.
forms_articles_main <- forms_articles %>%
  select(aid, ends_with("_1"), ends_with("_2"))

# combine with article dataset
protest_articles_true_claims_forms <- left_join(protest_articles_true_claims, forms_articles_main, by = "aid")

```

### Date
```{r}
# Read the predicted dates written by the date classification step and add
# them to the article table (join key: character `aid`).
articles_date <- mutate(
  read_delim("../data/taz2015_sample_date_predicted.csv", delim = ","),
  aid = as.character(aid)
)

protest_articles_true_claims_forms_date <- protest_articles_true_claims_forms %>%
  left_join(articles_date, by = "aid")
```

### Location
```{r}
# Location ----

# read predicted locations
location <- read_delim("../data/taz2015_sample_location_predicted.csv") %>%
  mutate(aid = as.character(aid))

# `pred_place` holds a bracketed, quoted, comma-separated list of places.
# Strip the brackets and single quotes, then create one row per place.
# Only `aid` and `pred_place` are kept (as for claims and forms), so that the
# aggregation below yields exactly one row per article regardless of which
# other columns the input file contains.
location <- location %>%
  select(aid, pred_place) %>%
  mutate(
    pred_place = gsub("\\[|\\]|'", "", pred_place),
    pred_place = strsplit(as.character(pred_place), ", ")
  ) %>%
  unnest(pred_place)

# count places per article, rank them (lid = 1 is the most frequent place;
# ties are ordered by place name) and aggregate at article level
# (columns pred_place_<lid> and loc_count_<lid>)
location <- location %>%
  count(aid, pred_place, name = "loc_count") %>%
  arrange(aid, desc(loc_count)) %>%
  group_by(aid) %>%
  mutate(lid = row_number()) %>%
  ungroup() %>%
  pivot_wider(values_from = c(pred_place, loc_count), names_from = lid)
```

### Put everything together

```{r}
# combine with the rest of the data: add the two most frequent places
protests_all <- left_join(
  protest_articles_true_claims_forms_date,
  location %>% select(aid,
                      ort1 = pred_place_1,
                      ort2 = pred_place_2,
                      ort1_count = loc_count_1,
                      ort2_count = loc_count_2),
  by = "aid"
)

# remove duplicates (same date, claim, form, and place): keep the first row
# of every combination.
# NOTE(review): `date` is the column coming from taz2015_sample_relevant.csv
# (presumably the publication date); the predicted event date is stored in
# `date_predicted`. Confirm which of the two should identify duplicates.
all_events <- protests_all %>%
  group_by(date, claim_nr_1, form_nr_1, ort1) %>%
  slice(1) %>%
  ungroup()  # do not leave the result grouped for the steps below

write_delim(all_events, file = "../data/taz2015_sample_protestevents.csv", delim = ",")

# preview without the long text and metadata columns; any_of() tolerates
# input files that lack some of these columns
all_events %>%
  select(-any_of(c("text", "atype", "source", "section", "title",
                   "subtitle", "author", "keyword", "pred_text"))) %>%
  head() %>%
  kable()

```
